# Author: Mark Richardson
# Date started: 03/03/2021
# Date finished: 03/03/2021
# Date revised: 02/03/2022 
# Purpose: Get ratings and informed priors to estimate performance ratings
# Revision purpose: More efficiently address works_with-agency_rated mapping errors
#                   and remove duplicate ratings (before revision duplicates were
#                   identified but not removed due to an error)

# Date revised: 06/09/2022
# Revision purpose: Keep PID and ideology to explore partisan bias in ratings

# Date revised: 08/16/2022
# Revision purpose: Keep appointee variables to estimate ratings that consider rater characteristics
# Proofed: 11/05/2023

# Date revised: 11/18/2023
# Revision purpose: Keep tenure variables to get mean age of raters
# Proofed: 11/05/2023

# Load packages
library(dplyr)
library(stringr)

# Load the 2020 survey data; the .RData file is expected to supply `sfgs`
load("data/sfgs_2020.RData")

# Keep only rows flagged as having responded
sfgs <- filter(sfgs, responded == 1)

#### Format the data ####

# Keep the rater identifiers plus the works-with, agency-rated, and rating
# items, then recode the -99 code in every rating column to NA

perf <- sfgs %>%
  select(id, dept, office, works_with_1:works_with_3, agency_rated_1:agency_rated_5, perf_rating_1:perf_rating_5) %>%
  mutate(across(perf_rating_1:perf_rating_5,
                ~ if_else(.x == -99, NA_real_, .x)))

# Squish whitespace in the agency_rated names (extra space due to an encoding
# issue between Java and Qualtrics - Â)

perf <- perf %>%
  mutate(across(agency_rated_1:agency_rated_5, str_squish))

# Rescale ratings to mean 0 and SD 1
# The mean and SD are pooled over all five rating columns

rating_matrix <- perf %>%
  select(perf_rating_1:perf_rating_5) %>%
  as.matrix()
rating_mean <- mean(rating_matrix, na.rm = TRUE)
rating_sd <- sd(rating_matrix, na.rm = TRUE)

# Helper applying the pooled standardization to one column
standardize <- function(x) (x - rating_mean) / rating_sd

# The original ratings are kept in place under an _org suffix; the rescaled
# versions are appended after them under the original names
perf <- perf %>%
  rename_with(~ paste0(.x, "_org"), .cols = perf_rating_1:perf_rating_5) %>%
  mutate(perf_rating_1 = standardize(perf_rating_1_org),
         perf_rating_2 = standardize(perf_rating_2_org),
         perf_rating_3 = standardize(perf_rating_3_org),
         perf_rating_4 = standardize(perf_rating_4_org),
         perf_rating_5 = standardize(perf_rating_5_org))

# Confirm rescaling: recompute the pooled mean and SD on the rescaled columns
rating_matrix <- perf %>%
  select(perf_rating_1:perf_rating_5) %>%
  as.matrix()
rating_mean <- mean(rating_matrix, na.rm = TRUE)
rating_sd <- sd(rating_matrix, na.rm = TRUE)

# Remove the temporary objects
rm(rating_matrix, rating_mean, rating_sd, standardize)

#### Deal with ww-agency_rated mapping errors ####

# Two possible errors
# 1. Skipping a works_with item and answering a subsequent item causes the Java coding to place the subsequent
#    ww in the earlier agency_rated slot, e.g., skip ww_1 and provide ww_2, then ww_2 will be agency_rated_1.
# 2. Selecting the same agency for multiple ww responses causes that agency to appear in the corresponding
#    agency_rated slots if the respondent is using Internet Explorer. Otherwise, the Java code removes the duplicates.

# Note: If OMB or OPM is selected as a works-with agency, the Internet Explorer mapping displays OMB and OPM in
#       the corresponding agency_rated slot, not in slots 4 or 5.

# Standardize names between works-with and agency rated
# works_with: drop every " (All)" suffix and the first
# "Offices and Bureaus within " prefix; agency_rated: drop the first " (All)"

perf <- perf %>%
  mutate(across(works_with_1:works_with_3,
                ~ str_remove(str_remove_all(.x, "\\s\\(All\\)"),
                             "Offices and Bureaus within\\s")),
         across(agency_rated_1:agency_rated_5,
                ~ str_remove(.x, "\\s\\(All\\)")))

# Identify works_with-agency_rated name mismatches
# Collect the distinct non-missing names used in each set of columns and list
# the works_with names that never occur as an agency_rated name

ww_values <- unlist(select(perf, works_with_1:works_with_3), use.names = FALSE)
ar_values <- unlist(select(perf, agency_rated_1:agency_rated_5), use.names = FALSE)

ww_names <- tibble(agency = na.omit(unique(ww_values)))
ar_names <- tibble(agency = na.omit(unique(ar_values)))

no_match <- ww_names %>%
  anti_join(ar_names, by = "agency") # Only no-match is -99

# Remove the diagnostic objects
rm(ww_values, ar_values, ww_names, ar_names, no_match)

#### Error 1 ####

# Diagnostics: respondents who skipped works_with_1 but answered a later item,
# and respondents who skipped works_with_2 but answered works_with_3
skip_1 <- perf %>%
  filter(works_with_1 == "-99" & (works_with_2 != "-99" | works_with_3 != "-99")) # One case

skip_2 <- perf %>%
  filter(works_with_2 == "-99" & works_with_3 != "-99") # None

# NOTE(review): the skip_1 diagnostic above also matches a respondent who
# skipped works_with_1 and works_with_2 but answered works_with_3; the recode
# below only moves works_with_2. Confirm the single case is a works_with_2 case.

perf <- perf %>%
  # Indicator for the single case. missing = 0 keeps the indicator at 0 when a
  # comparison is NA; otherwise the if_else() calls below would return NA for
  # that row and blank works_with_1 / works_with_2 for a non-skip respondent.
  mutate(skip_1 = if_else(works_with_1 == "-99" & works_with_2 != "-99",
                          1, 0, missing = 0)) %>%
  mutate(works_with_1 = if_else(skip_1 == 1, works_with_2, works_with_1), # Recode ww_1
         works_with_2 = if_else(skip_1 == 1, NA_character_, works_with_2)) # Recode ww_2 to missing to prevent appearance of duplicate rating

# Confirm fix
perf %>% filter(skip_1 == 1)

# Drop skip_1 variable
perf <- perf %>% select(!skip_1)

# clean up (removes the two diagnostic data frames)
rm(skip_1, skip_2)

#### Error 2 ####

# Identify duplicates that cause an error
#
# A duplicate is flagged only when the same agency (not "-99") appears in two
# or three works_with slots AND the corresponding agency_rated slots are equal.
# ww_dup_123 marks a triple; the pairwise flags are set only when ww_dup_123
# is 0.
#
# missing = 0: a comparison that evaluates to NA (a missing works_with or
# agency_rated value) is scored as "not a duplicate". Without it the flag is
# NA, and the removal step's if_else() returns NA for that row, blanking the
# slot 2/3 agency and rating even though no duplicate was identified.

perf <- perf %>%
  mutate(ww_dup_123 = if_else((works_with_1 == works_with_2 & works_with_2 == works_with_3 & works_with_1 != "-99") &
                                (agency_rated_1 == agency_rated_2 & agency_rated_2 == agency_rated_3),
                              1, 0, missing = 0),
         ww_dup_12 = if_else((works_with_1 == works_with_2 & works_with_1 != "-99") &
                               (agency_rated_1 == agency_rated_2) & ww_dup_123 == 0,
                             1, 0, missing = 0),
         ww_dup_13 = if_else((works_with_1 == works_with_3 & works_with_1 != "-99") &
                               (agency_rated_1 == agency_rated_3) & ww_dup_123 == 0,
                             1, 0, missing = 0),
         ww_dup_23 = if_else((works_with_2 == works_with_3 & works_with_2 != "-99") &
                               (agency_rated_2 == agency_rated_3) & ww_dup_123 == 0,
                             1, 0, missing = 0))

# Determine if all duplicates rated
# Each count below is the number of flagged respondents whose first duplicated
# rating is missing while a later duplicated rating is present
# Conclusion: No duplicated cases with first duplicated rating missing and other ratings non-missing
# Okay to keep only the first duplicated rating

# 123
perf %>%
  filter(ww_dup_123 == 1,
         is.na(perf_rating_1),
         !is.na(perf_rating_2) | !is.na(perf_rating_3)) %>%
  nrow()

# 12
perf %>%
  filter(ww_dup_12 == 1, is.na(perf_rating_1), !is.na(perf_rating_2)) %>%
  nrow()

# 13
perf %>%
  filter(ww_dup_13 == 1, is.na(perf_rating_1), !is.na(perf_rating_3)) %>%
  nrow()

# 23
perf %>%
  filter(ww_dup_23 == 1, is.na(perf_rating_2), !is.na(perf_rating_3)) %>%
  nrow()

# All flagged respondents, for inspection
ww <- perf %>%
  filter(if_any(c(ww_dup_123, ww_dup_12, ww_dup_13, ww_dup_23), ~ .x == 1)) %>%
  arrange(ww_dup_123, ww_dup_12, ww_dup_13, ww_dup_23, is.na(perf_rating_1))

# Remove the duplicates
# Slot 2 is blanked for a 123 or 12 duplicate; slot 3 is blanked for a 123, 13,
# or 23 duplicate. Blanking sets the works_with name, the agency_rated name,
# and both the original and rescaled rating to NA.

perf <- perf %>%
  # Temporary logical columns holding the two blanking conditions
  mutate(drop_slot_2 = ww_dup_123 == 1 | ww_dup_12 == 1,
         drop_slot_3 = ww_dup_123 == 1 | ww_dup_13 == 1 | ww_dup_23 == 1) %>%
  # Second ww and agency_rated (123 or 12)
  mutate(works_with_2 = if_else(drop_slot_2, NA_character_, works_with_2),
         agency_rated_2 = if_else(drop_slot_2, NA_character_, agency_rated_2),
         perf_rating_2_org = if_else(drop_slot_2, NA_real_, perf_rating_2_org),
         perf_rating_2 = if_else(drop_slot_2, NA_real_, perf_rating_2)) %>%
  # Third ww and agency_rated (123 or 13 or 23)
  mutate(works_with_3 = if_else(drop_slot_3, NA_character_, works_with_3),
         agency_rated_3 = if_else(drop_slot_3, NA_character_, agency_rated_3),
         perf_rating_3_org = if_else(drop_slot_3, NA_real_, perf_rating_3_org),
         perf_rating_3 = if_else(drop_slot_3, NA_real_, perf_rating_3)) %>%
  select(!c(drop_slot_2, drop_slot_3))

# Check recoding
# Recompute the four duplicate flags on the cleaned data and total them; rows
# whose flag total is NA are left out of the total

perf %>%
  as_tibble() %>%
  mutate(same_ww_12 = works_with_1 == works_with_2 & works_with_1 != "-99",
         same_ww_13 = works_with_1 == works_with_3 & works_with_1 != "-99",
         same_ww_23 = works_with_2 == works_with_3 & works_with_2 != "-99",
         same_ar_12 = agency_rated_1 == agency_rated_2,
         same_ar_13 = agency_rated_1 == agency_rated_3,
         same_ar_23 = agency_rated_2 == agency_rated_3) %>%
  mutate(ww_dup_123 = if_else(same_ww_12 & works_with_2 == works_with_3 &
                                same_ar_12 & same_ar_23,
                              1, 0),
         ww_dup_12 = if_else(same_ww_12 & same_ar_12 & ww_dup_123 == 0, 1, 0),
         ww_dup_13 = if_else(same_ww_13 & same_ar_13 & ww_dup_123 == 0, 1, 0),
         ww_dup_23 = if_else(same_ww_23 & same_ar_23 & ww_dup_123 == 0, 1, 0)) %>%
  transmute(row_total = ww_dup_123 + ww_dup_12 + ww_dup_13 + ww_dup_23) %>%
  summarize(count = sum(row_total, na.rm = TRUE))

# Remove the inspection data frame of flagged respondents
rm(ww)

#### Get informed ratings ####

# For each of the three works-with slots, keep the rows where the works_with
# name equals the agency_rated name in the same slot, then stack the slots
# into one long table with columns works_with, agency_rated, perf_rating

perf_inf <- lapply(1:3, function(slot) {
  ww_col <- paste0("works_with_", slot)
  ar_col <- paste0("agency_rated_", slot)
  pr_col <- paste0("perf_rating_", slot)

  perf %>%
    filter(.data[[ww_col]] == .data[[ar_col]]) %>%
    select(works_with = all_of(ww_col),
           agency_rated = all_of(ar_col),
           perf_rating = all_of(pr_col))
}) %>%
  bind_rows()

#### Get informed priors ####

# Per rated agency: number, mean, and variance of the non-missing informed
# (rescaled) ratings

rated_informed <- filter(perf_inf, !is.na(perf_rating))

perf_inf_priors <- rated_informed %>%
  group_by(agency_rated) %>%
  summarise(n = n(),
            perf_inf_mean = mean(perf_rating),
            perf_inf_var = var(perf_rating))

rm(rated_informed)

#### Get ratings and workplace ####

# Rater characteristics: PID, individual ideology, appointee, and tenure
rater_char <- sfgs %>%
  select(id, pid_3, pid_3_txt, pid_probe, ideology, app, yrs_agency, yrs_fed_gov)

# Rater id and workplace, the agency_rated names, and the original and
# rescaled ratings, joined with the rater characteristics by id
perf_ratings <- perf %>%
  select(id, dept, office, agency_rated_1:perf_rating_5) %>%
  full_join(rater_char, by = "id")

# Respondents with at least one non-missing original rating; the tenure code
# 98 is recoded to NA before summarizing
perf_1 <- perf_ratings %>%
  filter(if_any(perf_rating_1_org:perf_rating_5_org, ~ !is.na(.x))) %>%
  mutate(across(c(yrs_agency, yrs_fed_gov),
                ~ if_else(.x == 98, NA_real_, .x)))

summary(perf_1$yrs_agency)
summary(perf_1$yrs_fed_gov)

# Clean up
rm(rater_char, perf_1)

# The tenure variables are only needed for the summaries above
perf_ratings <- perf_ratings %>%
  select(!c(yrs_agency, yrs_fed_gov))

#### Save the ratings and informed priors ####

# Write both objects to a single .RData file: perf_ratings (rater-level
# ratings joined with rater characteristics) and perf_inf_priors (per-agency
# n, mean, and variance of the informed ratings)
save(perf_ratings, perf_inf_priors, file = "data/ratings/performance_ratings/01_performance_ratings_for_model.RData")
